Here we simulate from the true model, with the following parameters.
For each simulated data, we fit a ZINB model with all the possible combinations (16) of the following parameters:
See bias, variance, and MSE in separate Rmd files (simAllen_estimates_25.Rmd, simAllen_estimates_5.Rmd, and simAllen_estimates_75.Rmd).
par(mar=c(10.1,4.1,4.1,2.1))
ds = 'Allen'
for (nc in c(100, 1000)){
for (offs in c(-3.5, 0, 3.5)){
for (aa in c(1, .85)){
pref = sprintf('datasets/sim%s_%s_a%s_offs%s_seed9128',
ds, nc, aa, offs)
load(paste0(pref, '_dist.rda'))
corr <- lapply(1:15, function(i) rowMeans(sapply(res, function(x) x[[i]])))
corr = do.call(cbind, corr)
corr = data.frame(corr, stringsAsFactors = F)
colnames(corr) <- c("true W", paste0("zinb k=", 1:4), "PCA", "PCA TC",
"PCA TMM", "PCA DESeq2", "PCA FQ", 'ZIFA', 'ZIFA TC',
'ZIFA TMM', 'ZIFA DESeq2', 'ZIFA FQ')
main = paste0("Correlation between true and estimated sample distances in W\n", nc, " cells, offset ", offs, ", clustering ", aa)
boxplot(corr, main = main, border = c(3, 1, 2, rep(1,12)), xlab = "",
ylab = "Correlation", las = 2)
abline(h = 1, lty = 2)
abline(v = 5.5, lty = 2)
abline(v = 1.5, lty = 2)
abline(v = 10.5, lty = 2)
}
}
}
corr = lapply(c(100, 1000), function(nc){
lapply(c(1, .85), function(aa){
lapply(c(-3.5, 0, 3.5), function(offs){
pp = sprintf('datasets/simAllen_%s_a%s_offs%s_seed9128', nc, aa, offs)
load(paste0(pp, '_dist.rda'))
cc <- lapply(1:15, function(i) rowMeans(sapply(res, function(x) x[[i]])))
do.call(cbind, cc)
})
})
})
corr = do.call(rbind, do.call(rbind, do.call(rbind, corr)))
corr = data.frame(corr, stringsAsFactors = F)
colnames(corr) <- c("true W", paste0("zinb k=", 1:4), "PCA", "PCA TC",
"PCA TMM", "PCA DESeq2", "PCA FQ", 'ZIFA', 'ZIFA TC',
'ZIFA TMM', 'ZIFA DESeq2', 'ZIFA FQ')
corr$pzero = c(rep(rep(c(.25,.45,.75), each = 100), 2),
rep(rep(c(.25,.45,.75), each = 1000), 2))
corr$var = c(rep(paste('Clustering', 1:2), each = 300),
rep(paste('Clustering', 1:2), each = 3000))
corr$nc = c(rep('100', 600), rep('1000', 6000))
corrMolten = melt(corr, id.vars = c('pzero', 'var', 'nc'))
corrSum = corrMolten %>% group_by(pzero, var, variable, nc) %>%
summarize(mean = mean(value), sd = sd(value)) %>% ungroup() %>%
as.data.frame()
corrSum$pzero = factor(corrSum$pzero)
c1 = corrSum[corrSum$variable != 'true W',]
c1$method = sapply(strsplit(as.vector(c1$variable), ' '), '[[', 1)
allen = ggplot(c1, aes(x = pzero, y = mean, col = variable, group = variable)) +
geom_point() + geom_line() + labs(col='') +
theme_bw() + xlab('Zero Fraction') + facet_grid(nc ~ var) +
ylab('Correlation between true and estimated sample distances in W')
allen
#ggsave(filename="paper/6680489mtyrjx/allen_correlations.pdf", plot = allen,
# width = 5, height = 5)
for (nc in c(100, 1000)){
for (offs in c(-3.5, 0, 3.5)){
for (aa in c(1, .85)){
pref = sprintf('datasets/sim%s_%s_a%s_offs%s_seed9128',
ds, nc, aa, offs)
load(paste0(pref, '_dist.rda'))
sil <- lapply(16:length(res[[1]]), function(i){
rowMeans(sapply(res, function(x) x[[i]]))
})
sil = do.call(cbind, sil)
sil = data.frame(sil,stringsAsFactors = F)
colnames(sil) <- c("true W", paste0("zinb k=", 1:4), "PCA", "PCA TC",
"PCA TMM", "PCA DESeq2", "PCA FQ", 'ZIFA', 'ZIFA TC',
'ZIFA TMM', 'ZIFA DESeq2', 'ZIFA FQ')
main = paste0("Silhouette width of true labels\n", nc, " cells, offset ", offs, ", clustering ", aa)
boxplot(sil, main = main, border=c(3, 1, 2, rep(1,10)), xlab="",
ylab="Silhouette width", las=2)
boxplot(sil - sil[, 1], main = paste0("Difference of ", main),
border = c(3, 1, 2, rep(1,12)), xlab = "", las = 2,
ylab = "Silhouette width - True Silhouette width")
abline(h = 0, lty = 2)
}
}
}
sil = lapply(c(100, 1000), function(nc){
lapply(c(1, .85), function(aa){
lapply(c(-3.5, 0, 3.5), function(offs){
pp = sprintf('datasets/simAllen_%s_a%s_offs%s_seed9128', nc, aa, offs)
load(paste0(pp, '_dist.rda'))
ss <- lapply(16:length(res[[1]]), function(i){
rowMeans(sapply(res, function(x) x[[i]]))
})
do.call(cbind, ss)
})
})
})
sil = do.call(rbind, do.call(rbind, do.call(rbind, sil)))
sil = data.frame(sil, stringsAsFactors = F)
colnames(sil) <- c("true W", paste0("zinb k=", 1:4), "PCA", "PCA TC",
"PCA TMM", "PCA DESeq2", "PCA FQ", 'ZIFA', 'ZIFA TC',
'ZIFA TMM', 'ZIFA DESeq2', 'ZIFA FQ')
sil = sil - sil[,1]
sil$pzero = c(rep(rep(c(.25,.45,.75), each = 100), 2),
rep(rep(c(.25,.45,.75), each = 1000), 2))
sil$var = c(rep(paste('Clustering', 1:2), each = 300),
rep(paste('Clustering', 1:2), each = 3000))
sil$nc = c(rep('100', 600), rep('1000', 6000))
silMolten = melt(sil, id.vars = c('pzero', 'var', 'nc'))
silSum = silMolten %>% group_by(pzero, var, variable, nc) %>%
summarize(mean = mean(value), sd = sd(value)) %>% ungroup() %>%
as.data.frame()
silSum$pzero = factor(silSum$pzero)
c1 = silSum[silSum$variable != 'true W',]
c1$method = sapply(strsplit(as.vector(c1$variable), ' '), '[[', 1)
allen = ggplot(c1, aes(x = pzero, y = mean, col = variable, group = variable)) +
geom_point() + geom_line() + labs(col='') +
theme_bw() + xlab('Zero Fraction') + facet_grid(nc ~ var) +
ylab('Silhouette width - True Silhouette width') +
geom_hline(yintercept = 0, col = 'gray')
allen
#ggsave(filename="paper/6680489mtyrjx/allen_correlations.pdf", plot = allen,
# width = 5, height = 5)
For one simulated dataset with the right parameters (Vintercept, genewise dispersion, K= 2).
par(mfrow = c(3,4))
pref = "datasets/simAllen_100_a1_offs-3.5_seed9128"
load(paste0(pref, ".rda"))
load(paste0(pref, "_zifaFQ.rda"))
load(paste0(pref, "_fitted.rda"))
biotrue = factor(bio)
fit = fittedSim[[2]][[1]]
counts = t(simData[[1]]$counts)
counts = counts[rowSums(counts) != 0, ]
fq <- betweenLaneNormalization(counts, which="full")
pca <- prcomp(t(log1p(fq)))
par(mfrow=c(3,4))
plot(simModel@W, col = biotrue, main = 'True W\nfZero = 0.25',
xlab = 'W1', ylab = 'W2')
plot(fit@W, col = biotrue, main = 'estimated ZINB W\nfZero = 0.25',
xlab = 'W1', ylab = 'W2')
plot(pca$x, col = biotrue, main = 'PCA FQ\nfZero = 0.25')
plot(zifaFQ[[1]], col = biotrue, main = 'ZIFA FQ\nfZero = 0.25')
pref = "datasets/simAllen_100_a1_offs0_seed9128"
load(paste0(pref, ".rda"))
load(paste0(pref, "_zifaTC.rda"))
load(paste0(pref, "_fitted.rda"))
biotrue = factor(bio)
fit = fittedSim[[2]][[1]]
counts = t(simData[[1]]$counts)
counts = counts[rowSums(counts) != 0, ]
mult = sum(counts) / (ncol(counts) * nrow(counts))
fact = colSums(counts)
tc = mult * (t(counts) / fact)
pca <- prcomp(log1p(tc))
plot(simModel@W, col = biotrue, main = 'True W\nfZero = 0.45',
xlab = 'W1', ylab = 'W2')
plot(fit@W, col = biotrue, main = 'estimated ZINB W\nfZero = 0.45',
xlab = 'W1', ylab = 'W2')
plot(pca$x, col = biotrue, main = 'PCA TC\nfZero = 0.45')
plot(zifaTC[[1]], col = biotrue, main = 'ZIFA TC\nfZero = 0.45')
pref = "datasets/simAllen_100_a1_offs3.5_seed9128"
load(paste0(pref, ".rda"))
load(paste0(pref, "_zifaTC.rda"))
load(paste0(pref, "_fitted.rda"))
biotrue = factor(bio)
fit = fittedSim[[2]][[1]]
counts = t(simData[[1]]$counts)
counts = counts[rowSums(counts) != 0, ]
mult = sum(counts) / (ncol(counts) * nrow(counts))
fact = colSums(counts)
tc = mult * (t(counts) / fact)
pca <- prcomp(log1p(tc))
plot(simModel@W, col = biotrue, main = 'True W\nfZero = 0.75',
xlab = 'W1', ylab = 'W2')
plot(fit@W, col = biotrue, main = 'estimated ZINB W\nfZero = 0.75',
xlab = 'W1', ylab = 'W2')
plot(pca$x, col = biotrue, main = 'PCA TC\nfZero = 0.75')
plot(zifaTC[[1]], col = biotrue, main = 'ZIFA TC\nfZero = 0.75')
Let’s look at model fit when chosen parameters are correct (V, genewise dispersion, K=2).
From Sandrine’s slide.
Mean, variance, and zero probability for the ZINB model are \[E[Y_{i,j}] = (1 - \pi_{i,j})\mu_{i,j},\] \[var(Y_{i,j}) = (1 - \pi_{i,j})\mu_{i,j}(1 + \mu_{i,j}(\phi_j + \pi_{i,j})),\] \[P(Y_{i,j} = 0) = \pi_{i,j} + (1 - \pi_{i,j})(1 + \phi_j \mu_{i,j})^{\frac{1}{\phi_j}}.\]
For the NB model, \(\pi_{i,j}=0\). We fitted the NB using edgeR after full quantile normalization.
computeExp <- function(zinbModel){
(1 - t(getPi(zinbModel))) * t(getMu(zinbModel))
}
computeVar <- function(zinbModel){
mu = t(getMu(zinbModel))
pi = t(getPi(zinbModel))
phi = exp(-getZeta(zinbModel))
(1 - pi) * mu * (1 + mu*(phi + pi))
}
computeP0 <- function(zinbModel){
mu = t(getMu(zinbModel))
pi = t(getPi(zinbModel))
phi = exp(-getZeta(zinbModel))
pi + (1 - pi) * (1 + phi * mu) ^ (-1/phi)
}
plotMD <- function(x, y, xlim = c(0,10), ylim = c(-5, 5),
main = 'ZINB: MD-plot estimated vs. observed mean count, log scale'){
mm = .5*(x + y)
dd = x - y
smoothScatter(mm, dd, xlim = xlim, ylim = ylim, xlab = 'Mean', ylab = 'Diff', main = main)
abline(h = 0, col = 'gray')
fit = loess(dd ~ mm)
xpred = seq(0, 10, .1)
pred = predict(fit, xpred)
lines(xpred, pred, col = 'red', type = 'l', lwd=2)
}
fitNB <- function(rda, i = 1){
load(rda)
counts = t(simData[[i]]$counts)
phenoData = data.frame(bio)
set = newSeqExpressionSet(counts, phenoData = phenoData)
fq = betweenLaneNormalization(set, which = "full", offset = T)
disp = estimateDisp(counts(fq), offset = -offst(fq))
fit = glmFit(counts(fq), dispersion = disp$tagwise.dispersion, offset = -offst(fq))
list(fitted = fit$fitted.values, disp = disp$tagwise.dispersion)
}
There is one outlier for zinb fit.
pp = 'datasets/simAllen_1000_a1_offs0_seed9128'
# zinb
load(paste0(pp, '_fitted.rda'))
load(paste0(pp, '.rda'))
zz = fittedSim[[2]][[1]]
zinbEY = rowMeans(log1p(computeExp(zz)))
zinbPY0 = rowMeans(computeP0(zz))
# observed
counts = t(simData[[1]]$counts)
logAveCount = rowMeans(log1p(counts))
prop0 = rowMeans(counts == 0)
# edgeR
nb25 = fitNB(paste0(pp, '.rda'))
## Design matrix not provided. Switch to the classic mode.
nbEY = rowMeans(log1p(nb25$fitted))
nbPY0 = rowMeans((1 + nb25$fitted * nb25$disp)^(-1/nb25$disp))
MD-plot estimated vs. observed mean count, log scale, zoomed (we do not see the outlier)
par(mfrow=c(1,2))
plotMD(logAveCount, zinbEY, xlim = c(0, 7), ylim = c(-6, 6),
main = 'ZINB')
plotMD(logAveCount, nbEY, xlim = c(0, 7), ylim = c(-6, 6),
main = 'NB, edgeR')
MD-plot estimated vs. observed zero probability
par(mfrow=c(1,2))
plotMD(prop0, zinbPY0, xlim = c(0, 1), ylim = c(-1, 1),
main = 'ZINB')
plotMD(prop0, nbPY0, xlim = c(0, 1), ylim = c(-1, 1),
main = 'NB, edgeR')
par(mfrow=c(1,1))
smoothScatter(logAveCount, rowMeans(counts == 0),
xlab = 'log average count', xlim = c(0,7),
ylab = 'proportion of zeros', ylim = c(0,1),
main = 'Zero probability versus Mean')
fit = loess(zinbPY0 ~ zinbEY)
xpred = seq(0,9,.1)
pred = predict(fit, xpred)
lines(xpred, pred,col='green',type = 'l', lwd=2, ylim = c(0,1), xlim = c(0,7))
fit = loess(nbPY0 ~ nbEY)
xpred = seq(0,9,.1)
pred = predict(fit, xpred)
lines(xpred, pred,col='red', type='l', lwd=2, ylim = c(0,1), xlim = c(0,7))
fit = loess(rowMeans(counts == 0) ~ logAveCount)
xpred = seq(0,9,.1)
pred = predict(fit, xpred)
lines(xpred, pred,col = 'blue',type = 'l',lwd=2, ylim = c(0,1), xlim = c(0,7))
legend('topright', c('observed', 'zinb', 'nb, edgeR'),
lty=1, col=c('blue', 'green', 'red'), bty='n', cex=.75)
par(mfrow = c(1, 2))
smoothScatter(exp(-simModel@zeta),exp(-zz@zeta),
xlab = 'True Dispersion', ylab = 'Estimated Dispersion',
ylim = c(0, 10),xlim = c(0,10),
main = 'ZINB')
abline(a = 0, b = 1, col = 'gray')
smoothScatter(exp(-simModel@zeta), nb25$disp,
xlab = 'True Dispersion', ylab = 'Estimated Dispersion',
ylim = c(0, 10),xlim = c(0,10),
main = 'NB, edgeR')
abline(a = 0, b = 1, col = 'gray')
par(mfrow = c(1, 1))
xpred = seq(0, 1, .05)
fitzinb = loess(exp(-zz@zeta) ~ rowMeans(counts == 0))
predzinb = predict(fitzinb, xpred)
fitnb = loess(nb25$disp ~ rowMeans(counts == 0))
prednb = predict(fitnb, xpred)
smoothScatter(rowMeans(counts == 0), exp(-simModel@zeta),
xlab = 'Observed zero probability',ylim = c(0, 15),
ylab = 'True dispersion',xlim = c(0,1),
main = 'True Dispersion versus observed zero probability')
lines(xpred, predzinb, col = 'green', type = 'l', lwd=2)
lines(xpred, prednb, col = 'red', type = 'l', lwd=2)
legend('topright',c('zinb','nb, edgeR'),lty=1,col=c('green','red'), bty='n', cex=.75)